In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
In [2]:
df = pd.read_csv('D:\DOCUMENTS AND LECTURES\BS ECO\MetroLiving_Data new.csv')
In [3]:
df.head()
Out[3]:
neighborhood property_type units avg_size distance_to_public_transport distance_to_schools distance_to_park recent_sale_price year_built crime_rate
0 Columbia Heights Condominium 21 1571.0 0.67 0.72 0.99 1739228.0 1967 5.7
1 Petworth Townhouse 24 1173.0 0.17 0.80 0.83 NaN 1990 12.1
2 U Street Corridor Single Family Home 32 980.0 0.75 0.66 0.91 701071.0 2011 5.2
3 Adams Morgan Townhouse 14 1241.0 0.41 0.48 0.49 1186984.0 1968 20.7
4 Dupont Circle Townhouse 11 817.0 0.63 0.45 0.58 1869701.0 2015 16.1
In [4]:
print(df.isnull().sum())
neighborhood                      0
property_type                     0
units                             0
avg_size                         13
distance_to_public_transport      0
distance_to_schools               0
distance_to_park                  0
recent_sale_price               238
year_built                        0
crime_rate                        0
dtype: int64
In [5]:
df.dropna(inplace=True)
In [6]:
print(df.isnull().sum())
neighborhood                    0
property_type                   0
units                           0
avg_size                        0
distance_to_public_transport    0
distance_to_schools             0
distance_to_park                0
recent_sale_price               0
year_built                      0
crime_rate                      0
dtype: int64
In [7]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 1750 entries, 0 to 1999
Data columns (total 10 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   neighborhood                  1750 non-null   object 
 1   property_type                 1750 non-null   object 
 2   units                         1750 non-null   int64  
 3   avg_size                      1750 non-null   float64
 4   distance_to_public_transport  1750 non-null   float64
 5   distance_to_schools           1750 non-null   float64
 6   distance_to_park              1750 non-null   float64
 7   recent_sale_price             1750 non-null   float64
 8   year_built                    1750 non-null   int64  
 9   crime_rate                    1750 non-null   float64
dtypes: float64(6), int64(2), object(2)
memory usage: 150.4+ KB
In [8]:
df['proximity_to_public_transport'] = 1 / df['distance_to_public_transport']
df['proximity_to_schools'] = 1 / df['distance_to_schools']
df['proximity_to_park'] = 1 / df['distance_to_park']
In [9]:
df.describe()
Out[9]:
units avg_size distance_to_public_transport distance_to_schools distance_to_park recent_sale_price year_built crime_rate proximity_to_public_transport proximity_to_schools proximity_to_park
count 1750.000000 1750.000000 1750.000000 1750.000000 1750.000000 1.750000e+03 1750.000000 1750.000000 1750.000000 1750.000000 1750.000000
mean 21.557143 1205.936000 0.495354 0.548806 0.538429 1.259086e+06 1985.483429 15.047886 2.822182 2.570763 2.603409
std 11.000436 188.354657 0.235426 0.261646 0.257098 4.286924e+05 20.372082 5.774560 1.973248 1.856316 1.865713
min 3.000000 700.000000 0.100000 0.100000 0.100000 5.004110e+05 1950.000000 5.000000 1.111111 1.000000 1.000000
25% 12.000000 1077.000000 0.290000 0.330000 0.310000 8.974245e+05 1968.000000 9.900000 1.408451 1.298701 1.315789
50% 22.000000 1206.000000 0.490000 0.540000 0.540000 1.246108e+06 1986.000000 15.100000 2.040816 1.851852 1.851852
75% 31.000000 1335.750000 0.710000 0.770000 0.760000 1.640313e+06 2003.750000 20.000000 3.448276 3.030303 3.225806
max 40.000000 1873.000000 0.900000 1.000000 1.000000 1.996792e+06 2020.000000 25.000000 10.000000 10.000000 10.000000
In [10]:
import seaborn as sns
sns.pairplot(df, x_vars=['avg_size', 'proximity_to_public_transport', 'proximity_to_schools' , 'proximity_to_park' , 'year_built', 'crime_rate'], y_vars=['recent_sale_price'], kind='scatter')
plt.show()
D:\PYTHON - ANACONDA\Lib\site-packages\seaborn\axisgrid.py:118: UserWarning: The figure layout has changed to tight
  self._figure.tight_layout(*args, **kwargs)
In [11]:
plt.figure(figsize=(8, 6))
sns.lineplot(x='avg_size', y='recent_sale_price', data=df, ci='sd')
plt.show()
C:\Users\hassa\AppData\Local\Temp\ipykernel_9556\594477631.py:2: FutureWarning: 

The `ci` parameter is deprecated. Use `errorbar='sd'` for the same effect.

  sns.lineplot(x='avg_size', y='recent_sale_price', data=df, ci='sd')
In [12]:
plt.figure(figsize=(8, 6))
sns.histplot(data=df, x='avg_size', bins=20, kde=True)
plt.xlabel('Average Size')
plt.ylabel('Frequency')
plt.title('Distribution of Average Size')
plt.show()
In [13]:
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x='neighborhood', y='avg_size')
plt.xticks(rotation=45)
plt.xlabel('Neighborhood')
plt.ylabel('Average Size')
plt.title('Neighborhood vs. Average Size')
plt.show()
In [14]:
plt.figure(figsize=(12, 6))
sns.barplot(data=df, x='neighborhood', y='avg_size')
plt.xticks(rotation=45)
plt.xlabel('Neighborhood')
plt.ylabel('Average Size')
plt.title('Neighborhood vs. Average Size')
plt.show()
In [15]:
plt.figure(figsize=(10, 6))
sns.lineplot(x='year_built', y='recent_sale_price', data=df)
plt.xlabel('Year Built')
plt.ylabel('Recent Sale Price')
plt.title('Recent Sale Price Trend over Year Built')
plt.show()
In [16]:
plt.figure(figsize=(20, 6))
sns.boxplot(x='year_built', y='recent_sale_price', data=df)
plt.xlabel('Year Built')
plt.ylabel('Recent Sale Price')
plt.title('Recent Sale Price Distribution by Year Built')
plt.xticks(rotation=70)
plt.show()
In [17]:
plt.figure(figsize=(10, 6))
sns.boxplot(x='property_type', y='recent_sale_price', data=df)
plt.xlabel('Property Type')
plt.ylabel('Recent Sale Price')
plt.title('Recent Sale Price Distribution by Property Type')
plt.show()
In [20]:
plt.figure(figsize=(10, 6))
sns.lineplot(x='neighborhood', y='recent_sale_price', data=df)
plt.xlabel('Neighborhood')
plt.ylabel('Recent Sale Price')
plt.title('Trend of Recent Sale Prices by Neighborhood')
plt.xticks(rotation=45)  
plt.show()
In [21]:
plt.figure(figsize=(10, 6))
sns.lineplot(x='neighborhood', y='crime_rate', data=df)
plt.xlabel('Neighborhood')
plt.ylabel('Crime Rate')
plt.title('Crime Rate by Neighborhood')
plt.xticks(rotation=45)  
plt.show()
In [22]:
plt.figure(figsize=(8, 6))
sns.scatterplot(x='crime_rate', y='recent_sale_price', data=df)
plt.xlabel('Crime Rate')
plt.ylabel('Recent Sale Price')
plt.title('Crime Rate vs. Recent Sale Price')
plt.show()
In [23]:
plt.figure(figsize=(8, 6))
sns.lineplot(data=df, x='crime_rate', y='recent_sale_price', estimator='mean', ci=None)
plt.xlabel('Crime Rate')
plt.ylabel('Mean Recent Sale Price')
plt.title('Mean Recent Sale Price by Crime Rate')
plt.show()
C:\Users\hassa\AppData\Local\Temp\ipykernel_9556\2750345418.py:2: FutureWarning: 

The `ci` parameter is deprecated. Use `errorbar=None` for the same effect.

  sns.lineplot(data=df, x='crime_rate', y='recent_sale_price', estimator='mean', ci=None)
In [24]:
plt.figure(figsize=(8, 6))
sns.boxplot(data=df, x='property_type', y='crime_rate')
plt.xlabel('Property Type')
plt.ylabel('Crime Rate')
plt.title('Property Type vs. Crime Rate')
plt.show()
In [25]:
plt.figure(figsize=(12, 6))
sns.boxplot(x='neighborhood', y='recent_sale_price', data=df)
plt.xticks(rotation=45)
plt.show()
In [26]:
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x='neighborhood', y='proximity_to_public_transport')
plt.xticks(rotation=45)
plt.xlabel('Neighborhood')
plt.ylabel('Distance to proximity Transport')
plt.title('Neighborhood vs. Proximity to Public Transport')
plt.show()
In [27]:
plt.figure(figsize=(10, 6))
sns.barplot(data=df, x='neighborhood', y='proximity_to_public_transport', estimator='mean', ci=None)
plt.xticks(rotation=45)
plt.xlabel('Neighborhood')
plt.ylabel('Mean Distance to Proximity Transport')
plt.title('Mean Distance to Proximity Transport by Neighborhood')
plt.show()
C:\Users\hassa\AppData\Local\Temp\ipykernel_9556\2995364177.py:2: FutureWarning: 

The `ci` parameter is deprecated. Use `errorbar=None` for the same effect.

  sns.barplot(data=df, x='neighborhood', y='proximity_to_public_transport', estimator='mean', ci=None)
In [28]:
plt.figure(figsize=(10, 6))
sns.lineplot(data=df, x='neighborhood', y='proximity_to_public_transport', estimator='mean', ci=None)
plt.xticks(rotation=45)
plt.xlabel('Neighborhood')
plt.ylabel('Mean Distance to Proximity Transport')
plt.title('Mean Distance to Proximity Transport by Neighborhood')
plt.show()
C:\Users\hassa\AppData\Local\Temp\ipykernel_9556\1012449902.py:2: FutureWarning: 

The `ci` parameter is deprecated. Use `errorbar=None` for the same effect.

  sns.lineplot(data=df, x='neighborhood', y='proximity_to_public_transport', estimator='mean', ci=None)
In [29]:
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x='neighborhood', y='proximity_to_schools')
plt.xticks(rotation=45)
plt.xlabel('Neighborhood')
plt.ylabel('proximity to schools')
plt.title('Neighborhood vs. Proximity to schools')
plt.show()
In [30]:
plt.figure(figsize=(10, 6))
sns.barplot(data=df, x='neighborhood', y='proximity_to_schools', estimator='mean', ci=None)
plt.xticks(rotation=45)
plt.xlabel('Neighborhood')
plt.ylabel('Mean Proximity to Schools')
plt.title('Mean Proximity to Schools by Neighborhood')
plt.show()
C:\Users\hassa\AppData\Local\Temp\ipykernel_9556\1011597235.py:2: FutureWarning: 

The `ci` parameter is deprecated. Use `errorbar=None` for the same effect.

  sns.barplot(data=df, x='neighborhood', y='proximity_to_schools', estimator='mean', ci=None)
In [31]:
plt.figure(figsize=(10, 6))
sns.lineplot(data=df, x='neighborhood', y='proximity_to_schools', estimator='mean', ci=None)
plt.xticks(rotation=45)
plt.xlabel('Neighborhood')
plt.ylabel('Mean Proximity to Schools')
plt.title('Mean Proximity to Schools by Neighborhood')
plt.show()
C:\Users\hassa\AppData\Local\Temp\ipykernel_9556\1804502330.py:2: FutureWarning: 

The `ci` parameter is deprecated. Use `errorbar=None` for the same effect.

  sns.lineplot(data=df, x='neighborhood', y='proximity_to_schools', estimator='mean', ci=None)
In [32]:
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x='neighborhood', y='proximity_to_park')
plt.xticks(rotation=45)
plt.xlabel('Neighborhood')
plt.ylabel('proximity to Parks')
plt.title('Neighborhood vs. Proximity to Parks')
plt.show()
In [33]:
plt.figure(figsize=(10, 6))
sns.barplot(data=df, x='neighborhood', y='proximity_to_park', estimator='mean', ci=None)
plt.xticks(rotation=45)
plt.xlabel('Neighborhood')
plt.ylabel('Mean Proximity to Parks')
plt.title('Mean Proximity to Parks by Neighborhood')
plt.show()
C:\Users\hassa\AppData\Local\Temp\ipykernel_9556\3761785318.py:2: FutureWarning: 

The `ci` parameter is deprecated. Use `errorbar=None` for the same effect.

  sns.barplot(data=df, x='neighborhood', y='proximity_to_park', estimator='mean', ci=None)
In [34]:
plt.figure(figsize=(10, 6))
sns.lineplot(data=df, x='neighborhood', y='proximity_to_park', estimator='mean', ci=None)
plt.xticks(rotation=45)
plt.xlabel('Neighborhood')
plt.ylabel('Mean Proximity to Parks')
plt.title('Mean Proximity to Parks by Neighborhood')
plt.show()
C:\Users\hassa\AppData\Local\Temp\ipykernel_9556\2242482461.py:2: FutureWarning: 

The `ci` parameter is deprecated. Use `errorbar=None` for the same effect.

  sns.lineplot(data=df, x='neighborhood', y='proximity_to_park', estimator='mean', ci=None)
In [35]:
plt.figure(figsize=(12, 6))
sns.barplot(data=df, x='neighborhood', y='proximity_to_public_transport')
plt.xticks(rotation=45)
plt.xlabel('Neighborhood')
plt.ylabel('Proximity to public Transport')
plt.title('Neighborhood vs. Proximity to Public Transport')
plt.show()
In [36]:
plt.figure(figsize=(12, 6))
sns.barplot(data=df, x='neighborhood', y='proximity_to_schools')
plt.xticks(rotation=45)
plt.xlabel('Neighborhood')
plt.ylabel('Proximity to Schools')
plt.title('Neighborhood vs. Proximity to Schools')
plt.show()
In [37]:
plt.figure(figsize=(12, 6))
sns.barplot(data=df, x='neighborhood', y='proximity_to_park')
plt.xticks(rotation=45)
plt.xlabel('Neighborhood')
plt.ylabel('Proximity to Parks')
plt.title('Neighborhood vs. Proximity to Parks')
plt.show()
In [38]:
plt.figure(figsize=(12, 6))
sns.barplot(data=df, x='neighborhood', y='crime_rate')
plt.xticks(rotation=45)
plt.xlabel('Neighborhood')
plt.ylabel('Crime Rate')
plt.title('Neighborhood vs. Crime Rate')
plt.show()
In [42]:
plt.figure(figsize=(8, 6))
sns.histplot(data=df, x='crime_rate', bins=20, kde=True, hue='recent_sale_price', legend=False)
plt.xlabel('Crime Rate')
plt.ylabel('Frequency')
plt.title('Distribution of Recent Sale Price by Crime Rate')
plt.show()
In [40]:
df['crime_rate_bins'] = pd.cut(df['crime_rate'], bins=10)

plt.figure(figsize=(10, 6))
sns.barplot(data=df, x='crime_rate_bins', y='recent_sale_price', ci=None)
plt.xlabel('Crime Rate Bins')
plt.ylabel('Mean Recent Sale Price')
plt.title('Mean Recent Sale Price by Crime Rate Bins')
plt.xticks(rotation=45)
plt.show()
C:\Users\hassa\AppData\Local\Temp\ipykernel_9556\3204610068.py:4: FutureWarning: 

The `ci` parameter is deprecated. Use `errorbar=None` for the same effect.

  sns.barplot(data=df, x='crime_rate_bins', y='recent_sale_price', ci=None)
In [41]:
plt.figure(figsize=(10, 6))
sns.lineplot(data=df, x='crime_rate', y='recent_sale_price', estimator='mean', ci=None)
plt.xlabel('Crime Rate')
plt.ylabel('Mean Recent Sale Price')
plt.title('Mean Recent Sale Price by Crime Rate')
plt.show()
C:\Users\hassa\AppData\Local\Temp\ipykernel_9556\3725950289.py:2: FutureWarning: 

The `ci` parameter is deprecated. Use `errorbar=None` for the same effect.

  sns.lineplot(data=df, x='crime_rate', y='recent_sale_price', estimator='mean', ci=None)
In [43]:
plt.figure(figsize=(8, 6))
sns.scatterplot(data=df, x='proximity_to_public_transport', y='recent_sale_price')
plt.xlabel('Proximity to Public Transport')
plt.ylabel('Recent Sale Price')
plt.title('Proximity to Public Transport vs. Recent Sale Price')
plt.show()
In [44]:
df['proximity_bins'] = pd.cut(df['proximity_to_public_transport'], bins=10)

plt.figure(figsize=(10, 6))
sns.barplot(data=df, x='proximity_bins', y='recent_sale_price', ci=None)
plt.xlabel('Proximity to Public Transport (Bins)')
plt.ylabel('Mean Recent Sale Price')
plt.title('Mean Recent Sale Price by Proximity to Public Transport')
plt.xticks(rotation=45)
plt.show()
C:\Users\hassa\AppData\Local\Temp\ipykernel_9556\4223314786.py:4: FutureWarning: 

The `ci` parameter is deprecated. Use `errorbar=None` for the same effect.

  sns.barplot(data=df, x='proximity_bins', y='recent_sale_price', ci=None)
In [45]:
plt.figure(figsize=(8, 6))
sns.lineplot(data=df, x='proximity_to_public_transport', y='recent_sale_price', estimator='mean', ci=None)
plt.xlabel('Proximity to Public Transport')
plt.ylabel('Mean Recent Sale Price')
plt.title('Mean Recent Sale Price by Proximity to Public Transport')
plt.show()
C:\Users\hassa\AppData\Local\Temp\ipykernel_9556\1972856151.py:2: FutureWarning: 

The `ci` parameter is deprecated. Use `errorbar=None` for the same effect.

  sns.lineplot(data=df, x='proximity_to_public_transport', y='recent_sale_price', estimator='mean', ci=None)
In [46]:
df['proximity_bins'] = pd.cut(df['proximity_to_park'], bins=10)

plt.figure(figsize=(10, 6))
sns.barplot(data=df, x='proximity_bins', y='recent_sale_price', ci=None)
plt.xlabel('Proximity to Park (Bins)')
plt.ylabel('Mean Recent Sale Price')
plt.title('Mean Recent Sale Price by Proximity to Park')
plt.xticks(rotation=45)
plt.show()
C:\Users\hassa\AppData\Local\Temp\ipykernel_9556\278634783.py:4: FutureWarning: 

The `ci` parameter is deprecated. Use `errorbar=None` for the same effect.

  sns.barplot(data=df, x='proximity_bins', y='recent_sale_price', ci=None)
In [47]:
plt.figure(figsize=(8, 6))
sns.lineplot(data=df, x='proximity_to_park', y='recent_sale_price', estimator='mean', ci=None)
plt.xlabel('Proximity to Park')
plt.ylabel('Mean Recent Sale Price')
plt.title('Mean Recent Sale Price by Proximity to Park')
plt.show()
C:\Users\hassa\AppData\Local\Temp\ipykernel_9556\314549666.py:2: FutureWarning: 

The `ci` parameter is deprecated. Use `errorbar=None` for the same effect.

  sns.lineplot(data=df, x='proximity_to_park', y='recent_sale_price', estimator='mean', ci=None)
In [48]:
plt.figure(figsize=(8, 6))
sns.scatterplot(data=df, x='proximity_to_schools', y='recent_sale_price')
plt.xlabel('Proximity to schools')
plt.ylabel('Recent Sale Price')
plt.title('Proximity to Schools vs. Recent Sale Price')
plt.show()
In [49]:
df['proximity_bins'] = pd.cut(df['proximity_to_schools'], bins=10)

plt.figure(figsize=(10, 6))
sns.barplot(data=df, x='proximity_bins', y='recent_sale_price', ci=None)
plt.xlabel('Proximity to Schools (Bins)')
plt.ylabel('Mean Recent Sale Price')
plt.title('Mean Recent Sale Price by Proximity to Schools')
plt.xticks(rotation=45)
plt.show()
C:\Users\hassa\AppData\Local\Temp\ipykernel_9556\2976156522.py:4: FutureWarning: 

The `ci` parameter is deprecated. Use `errorbar=None` for the same effect.

  sns.barplot(data=df, x='proximity_bins', y='recent_sale_price', ci=None)
In [50]:
plt.figure(figsize=(8, 6))
sns.lineplot(data=df, x='proximity_to_schools', y='recent_sale_price', estimator='mean', ci=None)
plt.xlabel('Proximity to Schools')
plt.ylabel('Mean Recent Sale Price')
plt.title('Mean Recent Sale Price by Proximity to Schools')
plt.show()
C:\Users\hassa\AppData\Local\Temp\ipykernel_9556\2817001178.py:2: FutureWarning: 

The `ci` parameter is deprecated. Use `errorbar=None` for the same effect.

  sns.lineplot(data=df, x='proximity_to_schools', y='recent_sale_price', estimator='mean', ci=None)
In [53]:
plt.figure(figsize=(10, 6))
sns.lineplot(data=df, x='proximity_to_public_transport', y='recent_sale_price')
plt.title('Recent Sale Price trend with Proximity to Public Transport')
plt.xlabel('Proximity to Public Transport')
plt.ylabel('Recent Sale Price')
plt.show()
In [52]:
correlation_matrix = df[['units' , 'crime_rate' , 'avg_size' , 'year_built' , 'proximity_to_public_transport', 'proximity_to_schools', 'proximity_to_park', 'recent_sale_price']].corr()
plt.figure(figsize=(10, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix')
plt.show()
In [54]:
correlation_matrix = df[['proximity_to_public_transport', 'proximity_to_schools', 'proximity_to_park', 'recent_sale_price']].corr()
plt.figure(figsize=(10, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix')
plt.show()
In [56]:
import statsmodels.api as sm
X = df[['units', 'avg_size', 'proximity_to_public_transport', 'proximity_to_schools', 'proximity_to_park' , 'year_built', 'crime_rate']]
y = df['recent_sale_price']
X = sm.add_constant(X)
model_ols = sm.OLS(y, X).fit()
print(model_ols.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:      recent_sale_price   R-squared:                       0.003
Model:                            OLS   Adj. R-squared:                 -0.001
Method:                 Least Squares   F-statistic:                    0.6738
Date:                Thu, 18 Apr 2024   Prob (F-statistic):              0.694
Time:                        04:01:17   Log-Likelihood:                -25175.
No. Observations:                1750   AIC:                         5.037e+04
Df Residuals:                    1742   BIC:                         5.041e+04
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
=================================================================================================
                                    coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------
const                          2.469e+06      1e+06      2.461      0.014    5.01e+05    4.44e+06
units                           326.9055    933.424      0.350      0.726   -1503.845    2157.656
avg_size                         -9.7953     54.461     -0.180      0.857    -116.612      97.021
proximity_to_public_transport -6244.0916   5201.767     -1.200      0.230   -1.64e+04    3958.274
proximity_to_schools          -2092.6498   5531.818     -0.378      0.705   -1.29e+04    8757.054
proximity_to_park             -6076.8969   5510.253     -1.103      0.270   -1.69e+04    4730.510
year_built                     -597.6282    503.675     -1.187      0.236   -1585.499     390.242
crime_rate                     1348.4206   1780.008      0.758      0.449   -2142.756    4839.597
==============================================================================
Omnibus:                     1410.383   Durbin-Watson:                   2.010
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              105.347
Skew:                          -0.001   Prob(JB):                     1.33e-23
Kurtosis:                       1.798   Cond. No.                     2.27e+05
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 2.27e+05. This might indicate that there are
strong multicollinearity or other numerical problems.
In [57]:
df['interaction_term'] = df['avg_size'] * df['crime_rate']


X = df[['avg_size', 'crime_rate', 'interaction_term']]  
X = sm.add_constant(X) 
y = df['recent_sale_price'] 

model = sm.OLS(y, X).fit()


print(model.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:      recent_sale_price   R-squared:                       0.001
Model:                            OLS   Adj. R-squared:                 -0.001
Method:                 Least Squares   F-statistic:                    0.3223
Date:                Thu, 18 Apr 2024   Prob (F-statistic):              0.809
Time:                        04:01:58   Log-Likelihood:                -25177.
No. Observations:                1750   AIC:                         5.036e+04
Df Residuals:                    1746   BIC:                         5.038e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
====================================================================================
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const             1.364e+06   1.81e+05      7.541      0.000    1.01e+06    1.72e+06
avg_size          -101.8587    147.344     -0.691      0.489    -390.848     187.131
crime_rate       -6299.6834   1.14e+04     -0.554      0.579   -2.86e+04     1.6e+04
interaction_term     6.2083      9.245      0.672      0.502     -11.924      24.341
==============================================================================
Omnibus:                     1440.189   Durbin-Watson:                   2.003
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              105.774
Skew:                           0.001   Prob(JB):                     1.08e-23
Kurtosis:                       1.796   Cond. No.                     3.49e+05
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 3.49e+05. This might indicate that there are
strong multicollinearity or other numerical problems.
In [58]:
df['interaction_term'] = df['avg_size'] * df['units'] 


X = df[['avg_size', 'units', 'interaction_term']] 
X = sm.add_constant(X) 
y = df['recent_sale_price'] 


model = sm.OLS(y, X).fit()


print(model.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:      recent_sale_price   R-squared:                       0.001
Model:                            OLS   Adj. R-squared:                 -0.001
Method:                 Least Squares   F-statistic:                    0.5358
Date:                Thu, 18 Apr 2024   Prob (F-statistic):              0.658
Time:                        04:02:11   Log-Likelihood:                -25177.
No. Observations:                1750   AIC:                         5.036e+04
Df Residuals:                    1746   BIC:                         5.038e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
====================================================================================
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const             1.424e+06   1.49e+05      9.557      0.000    1.13e+06    1.72e+06
avg_size          -141.3157    121.528     -1.163      0.245    -379.670      97.039
units            -7028.1015   6114.061     -1.149      0.251    -1.9e+04    4963.551
interaction_term     6.0498      4.988      1.213      0.225      -3.733      15.832
==============================================================================
Omnibus:                     1376.251   Durbin-Watson:                   2.004
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              104.844
Skew:                           0.001   Prob(JB):                     1.71e-23
Kurtosis:                       1.801   Cond. No.                     4.30e+05
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 4.3e+05. This might indicate that there are
strong multicollinearity or other numerical problems.
In [59]:
df['interaction_term'] = df['avg_size'] * df['proximity_to_public_transport']  


X = df[['avg_size', 'proximity_to_public_transport', 'interaction_term']]  
X = sm.add_constant(X) 
y = df['recent_sale_price'] 


model = sm.OLS(y, X).fit()


print(model.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:      recent_sale_price   R-squared:                       0.003
Model:                            OLS   Adj. R-squared:                  0.001
Method:                 Least Squares   F-statistic:                     1.519
Date:                Thu, 18 Apr 2024   Prob (F-statistic):              0.208
Time:                        04:02:23   Log-Likelihood:                -25175.
No. Observations:                1750   AIC:                         5.036e+04
Df Residuals:                    1746   BIC:                         5.038e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
=================================================================================================
                                    coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------
const                           1.12e+06   1.16e+05      9.621      0.000    8.92e+05    1.35e+06
avg_size                        129.0879     95.263      1.355      0.176     -57.754     315.930
proximity_to_public_transport  5.222e+04   3.33e+04      1.569      0.117   -1.31e+04    1.18e+05
interaction_term                -48.2382     27.194     -1.774      0.076    -101.575       5.098
==============================================================================
Omnibus:                     1263.443   Durbin-Watson:                   2.006
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              103.046
Skew:                           0.004   Prob(JB):                     4.21e-23
Kurtosis:                       1.811   Cond. No.                     5.06e+04
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 5.06e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
In [60]:
df['interaction_term'] = df['avg_size'] * df['proximity_to_schools'] 


X = df[['avg_size', 'proximity_to_schools', 'interaction_term']]  
X = sm.add_constant(X)  
y = df['recent_sale_price']  

model = sm.OLS(y, X).fit()


print(model.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:      recent_sale_price   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                 -0.002
Method:                 Least Squares   F-statistic:                   0.06114
Date:                Thu, 18 Apr 2024   Prob (F-statistic):              0.980
Time:                        04:02:33   Log-Likelihood:                -25177.
No. Observations:                1750   AIC:                         5.036e+04
Df Residuals:                    1746   BIC:                         5.038e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
========================================================================================
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                 1.279e+06   1.13e+05     11.309      0.000    1.06e+06     1.5e+06
avg_size               -11.6761     92.370     -0.126      0.899    -192.845     169.492
proximity_to_schools -3006.7583   3.35e+04     -0.090      0.928   -6.87e+04    6.27e+04
interaction_term         0.7093     27.288      0.026      0.979     -52.812      54.230
==============================================================================
Omnibus:                     1403.697   Durbin-Watson:                   2.005
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              105.250
Skew:                           0.000   Prob(JB):                     1.40e-23
Kurtosis:                       1.799   Cond. No.                     4.55e+04
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 4.55e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
In [79]:
df['interaction_term'] = df['avg_size'] * df['proximity_to_park']  

X = df[['avg_size', 'proximity_to_park', 'interaction_term']]  
X = sm.add_constant(X) 
y = df['recent_sale_price']  


model = sm.OLS(y, X).fit()


print(model.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:      recent_sale_price   R-squared:                       0.001
Model:                            OLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.7377
Date:                Thu, 18 Apr 2024   Prob (F-statistic):              0.530
Time:                        04:33:05   Log-Likelihood:                -25176.
No. Observations:                1750   AIC:                         5.036e+04
Df Residuals:                    1746   BIC:                         5.038e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
=====================================================================================
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
const               1.19e+06   1.12e+05     10.610      0.000     9.7e+05    1.41e+06
avg_size             69.3588     91.551      0.758      0.449    -110.202     248.920
proximity_to_park  3.054e+04    3.4e+04      0.899      0.369   -3.61e+04    9.72e+04
interaction_term    -29.8176     27.696     -1.077      0.282     -84.138      24.503
==============================================================================
Omnibus:                     1412.032   Durbin-Watson:                   2.006
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              105.371
Skew:                           0.000   Prob(JB):                     1.31e-23
Kurtosis:                       1.798   Cond. No.                     4.55e+04
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 4.55e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
In [64]:
print("Data type of y:", y.dtype)
print("Data type of X:")
print(X.dtypes)
Data type of y: float64
Data type of X:
const                                float64
units                                  int64
avg_size                             float64
proximity_to_public_transport        float64
proximity_to_schools                 float64
proximity_to_park                    float64
year_built                             int64
crime_rate                           float64
neighborhood_Anacostia                  bool
neighborhood_Brookland                  bool
neighborhood_Capitol Hill               bool
neighborhood_Columbia Heights           bool
neighborhood_Congress Heights           bool
neighborhood_Deanwood                   bool
neighborhood_Dupont Circle              bool
neighborhood_Foggy Bottom               bool
neighborhood_Georgetown                 bool
neighborhood_Kingman Park               bool
neighborhood_Logan Circle               bool
neighborhood_Mount Pleasant             bool
neighborhood_Petworth                   bool
neighborhood_Shaw                       bool
neighborhood_Southwest Waterfront       bool
neighborhood_Trinidad                   bool
neighborhood_U Street Corridor          bool
neighborhood_Van Ness                   bool
neighborhood_Woodley Park               bool
property_type_Single Family Home        bool
property_type_Townhouse                 bool
dtype: object
In [65]:
for col in X.columns[X.dtypes == 'bool']:
    X[col] = X[col].astype(int)
In [77]:
print(X.dtypes)

X = X.apply(pd.to_numeric, errors='coerce')


print(X.dtypes)
const                                float64
units                                  int64
avg_size                             float64
distance_to_public_transport         float64
distance_to_schools                  float64
distance_to_park                     float64
year_built                             int64
crime_rate                           float64
proximity_to_public_transport        float64
proximity_to_schools                 float64
proximity_to_park                    float64
interaction_term                     float64
neighborhood_Anacostia                  bool
neighborhood_Brookland                  bool
neighborhood_Capitol Hill               bool
neighborhood_Columbia Heights           bool
neighborhood_Congress Heights           bool
neighborhood_Deanwood                   bool
neighborhood_Dupont Circle              bool
neighborhood_Foggy Bottom               bool
neighborhood_Georgetown                 bool
neighborhood_Kingman Park               bool
neighborhood_Logan Circle               bool
neighborhood_Mount Pleasant             bool
neighborhood_Petworth                   bool
neighborhood_Shaw                       bool
neighborhood_Southwest Waterfront       bool
neighborhood_Trinidad                   bool
neighborhood_U Street Corridor          bool
neighborhood_Van Ness                   bool
neighborhood_Woodley Park               bool
property_type_Single Family Home        bool
property_type_Townhouse                 bool
neighborhood_Adams Morgan               bool
neighborhood_Anacostia                  bool
neighborhood_Brookland                  bool
neighborhood_Capitol Hill               bool
neighborhood_Columbia Heights           bool
neighborhood_Congress Heights           bool
neighborhood_Deanwood                   bool
neighborhood_Dupont Circle              bool
neighborhood_Foggy Bottom               bool
neighborhood_Georgetown                 bool
neighborhood_Kingman Park               bool
neighborhood_Logan Circle               bool
neighborhood_Mount Pleasant             bool
neighborhood_Petworth                   bool
neighborhood_Shaw                       bool
neighborhood_Southwest Waterfront       bool
neighborhood_Trinidad                   bool
neighborhood_U Street Corridor          bool
neighborhood_Van Ness                   bool
neighborhood_Woodley Park               bool
property_type_Condominium               bool
property_type_Single Family Home        bool
property_type_Townhouse                 bool
dtype: object
const                                float64
units                                  int64
avg_size                             float64
distance_to_public_transport         float64
distance_to_schools                  float64
distance_to_park                     float64
year_built                             int64
crime_rate                           float64
proximity_to_public_transport        float64
proximity_to_schools                 float64
proximity_to_park                    float64
interaction_term                     float64
neighborhood_Anacostia                  bool
neighborhood_Brookland                  bool
neighborhood_Capitol Hill               bool
neighborhood_Columbia Heights           bool
neighborhood_Congress Heights           bool
neighborhood_Deanwood                   bool
neighborhood_Dupont Circle              bool
neighborhood_Foggy Bottom               bool
neighborhood_Georgetown                 bool
neighborhood_Kingman Park               bool
neighborhood_Logan Circle               bool
neighborhood_Mount Pleasant             bool
neighborhood_Petworth                   bool
neighborhood_Shaw                       bool
neighborhood_Southwest Waterfront       bool
neighborhood_Trinidad                   bool
neighborhood_U Street Corridor          bool
neighborhood_Van Ness                   bool
neighborhood_Woodley Park               bool
property_type_Single Family Home        bool
property_type_Townhouse                 bool
neighborhood_Adams Morgan               bool
neighborhood_Anacostia                  bool
neighborhood_Brookland                  bool
neighborhood_Capitol Hill               bool
neighborhood_Columbia Heights           bool
neighborhood_Congress Heights           bool
neighborhood_Deanwood                   bool
neighborhood_Dupont Circle              bool
neighborhood_Foggy Bottom               bool
neighborhood_Georgetown                 bool
neighborhood_Kingman Park               bool
neighborhood_Logan Circle               bool
neighborhood_Mount Pleasant             bool
neighborhood_Petworth                   bool
neighborhood_Shaw                       bool
neighborhood_Southwest Waterfront       bool
neighborhood_Trinidad                   bool
neighborhood_U Street Corridor          bool
neighborhood_Van Ness                   bool
neighborhood_Woodley Park               bool
property_type_Condominium               bool
property_type_Single Family Home        bool
property_type_Townhouse                 bool
dtype: object
In [78]:
X = X.astype(int)


ols_model = sm.OLS(y, X)
ols_results = ols_model.fit()


print(ols_results.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:      recent_sale_price   R-squared:                       0.013
Model:                            OLS   Adj. R-squared:                 -0.005
Method:                 Least Squares   F-statistic:                    0.7239
Date:                Thu, 18 Apr 2024   Prob (F-statistic):              0.867
Time:                        04:32:07   Log-Likelihood:                -25166.
No. Observations:                1750   AIC:                         5.040e+04
Df Residuals:                    1718   BIC:                         5.057e+04
Df Model:                          31                                         
Covariance Type:            nonrobust                                         
=====================================================================================================
                                        coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------
const                              1.559e+06   6.33e+05      2.463      0.014    3.18e+05     2.8e+06
units                               462.2484    939.565      0.492      0.623   -1380.564    2305.060
avg_size                             15.6411     78.580      0.199      0.842    -138.482     169.764
distance_to_public_transport       2.649e-06   1.56e-06      1.701      0.089   -4.05e-07     5.7e-06
distance_to_schools                 -1.2e+04   1.21e+05     -0.100      0.921   -2.48e+05    2.24e+05
distance_to_park                  -1.227e+04   1.45e+05     -0.084      0.933   -2.97e+05    2.73e+05
year_built                         -619.6622    506.499     -1.223      0.221   -1613.082     373.758
crime_rate                         1118.4326   1795.468      0.623      0.533   -2403.102    4639.967
proximity_to_public_transport     -7317.3102   5180.094     -1.413      0.158   -1.75e+04    2842.646
proximity_to_schools              -1894.1038   5615.532     -0.337      0.736   -1.29e+04    9119.897
proximity_to_park                  7083.3340   2.58e+04      0.275      0.783   -4.34e+04    5.76e+04
interaction_term                    -11.8283     20.970     -0.564      0.573     -52.958      29.301
neighborhood_Anacostia             7.042e+04   3.76e+04      1.871      0.062   -3402.255    1.44e+05
neighborhood_Brookland             8.946e+04   3.79e+04      2.362      0.018    1.52e+04    1.64e+05
neighborhood_Capitol Hill           9.25e+04   3.85e+04      2.401      0.016    1.69e+04    1.68e+05
neighborhood_Columbia Heights      1.041e+05   3.73e+04      2.794      0.005     3.1e+04    1.77e+05
neighborhood_Congress Heights      8.953e+04   3.67e+04      2.436      0.015    1.74e+04    1.62e+05
neighborhood_Deanwood              6.126e+04   3.86e+04      1.586      0.113   -1.45e+04    1.37e+05
neighborhood_Dupont Circle         9.949e+04   3.84e+04      2.589      0.010    2.41e+04    1.75e+05
neighborhood_Foggy Bottom          7.555e+04   3.92e+04      1.929      0.054   -1265.503    1.52e+05
neighborhood_Georgetown            7.347e+04   3.89e+04      1.890      0.059   -2753.734     1.5e+05
neighborhood_Kingman Park          2.047e+04   3.82e+04      0.535      0.592   -5.45e+04    9.55e+04
neighborhood_Logan Circle          7.073e+04   3.69e+04      1.916      0.056   -1682.959    1.43e+05
neighborhood_Mount Pleasant        9.044e+04   3.73e+04      2.426      0.015    1.73e+04    1.64e+05
neighborhood_Petworth              5.965e+04   3.67e+04      1.627      0.104   -1.23e+04    1.32e+05
neighborhood_Shaw                  5.314e+04   3.62e+04      1.470      0.142   -1.78e+04    1.24e+05
neighborhood_Southwest Waterfront  7.864e+04   3.77e+04      2.087      0.037    4730.477    1.53e+05
neighborhood_Trinidad               6.43e+04   3.77e+04      1.706      0.088   -9635.341    1.38e+05
neighborhood_U Street Corridor     3.733e+04   3.93e+04      0.949      0.343   -3.98e+04    1.14e+05
neighborhood_Van Ness              9.577e+04   3.73e+04      2.568      0.010    2.26e+04    1.69e+05
neighborhood_Woodley Park          9.277e+04   3.72e+04      2.491      0.013    1.97e+04    1.66e+05
property_type_Single Family Home   3.937e+05   1.58e+05      2.484      0.013    8.28e+04    7.04e+05
property_type_Townhouse            3.801e+05   1.59e+05      2.396      0.017    6.89e+04    6.91e+05
neighborhood_Adams Morgan          1.404e+05   7.27e+04      1.932      0.053   -2114.381    2.83e+05
neighborhood_Anacostia             7.042e+04   3.76e+04      1.871      0.062   -3402.255    1.44e+05
neighborhood_Brookland             8.946e+04   3.79e+04      2.362      0.018    1.52e+04    1.64e+05
neighborhood_Capitol Hill           9.25e+04   3.85e+04      2.401      0.016    1.69e+04    1.68e+05
neighborhood_Columbia Heights      1.041e+05   3.73e+04      2.794      0.005     3.1e+04    1.77e+05
neighborhood_Congress Heights      8.953e+04   3.67e+04      2.436      0.015    1.74e+04    1.62e+05
neighborhood_Deanwood              6.126e+04   3.86e+04      1.586      0.113   -1.45e+04    1.37e+05
neighborhood_Dupont Circle         9.949e+04   3.84e+04      2.589      0.010    2.41e+04    1.75e+05
neighborhood_Foggy Bottom          7.555e+04   3.92e+04      1.929      0.054   -1265.503    1.52e+05
neighborhood_Georgetown            7.347e+04   3.89e+04      1.890      0.059   -2753.734     1.5e+05
neighborhood_Kingman Park          2.047e+04   3.82e+04      0.535      0.592   -5.45e+04    9.55e+04
neighborhood_Logan Circle          7.073e+04   3.69e+04      1.916      0.056   -1682.959    1.43e+05
neighborhood_Mount Pleasant        9.044e+04   3.73e+04      2.426      0.015    1.73e+04    1.64e+05
neighborhood_Petworth              5.965e+04   3.67e+04      1.627      0.104   -1.23e+04    1.32e+05
neighborhood_Shaw                  5.314e+04   3.62e+04      1.470      0.142   -1.78e+04    1.24e+05
neighborhood_Southwest Waterfront  7.864e+04   3.77e+04      2.087      0.037    4730.477    1.53e+05
neighborhood_Trinidad               6.43e+04   3.77e+04      1.706      0.088   -9635.341    1.38e+05
neighborhood_U Street Corridor     3.733e+04   3.93e+04      0.949      0.343   -3.98e+04    1.14e+05
neighborhood_Van Ness              9.577e+04   3.73e+04      2.568      0.010    2.26e+04    1.69e+05
neighborhood_Woodley Park          9.277e+04   3.72e+04      2.491      0.013    1.97e+04    1.66e+05
property_type_Condominium          7.857e+05   3.17e+05      2.482      0.013    1.65e+05    1.41e+06
property_type_Single Family Home   3.937e+05   1.58e+05      2.484      0.013    8.28e+04    7.04e+05
property_type_Townhouse            3.801e+05   1.59e+05      2.396      0.017    6.89e+04    6.91e+05
==============================================================================
Omnibus:                     1333.795   Durbin-Watson:                   2.003
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              104.205
Skew:                          -0.007   Prob(JB):                     2.36e-23
Kurtosis:                       1.805   Cond. No.                     1.65e+17
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 1.23e-24. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
In [87]:
from sklearn.linear_model import LinearRegression

X = df[['units', 'avg_size', 'proximity_to_public_transport', 'proximity_to_schools', 'proximity_to_park' , 'year_built', 'crime_rate']]
y = df['recent_sale_price']


model_multiple_linear_regression = LinearRegression()
model_multiple_linear_regression.fit(X, y)


print('Intercept:', model_multiple_linear_regression.intercept_)
print('Coefficients:', model_multiple_linear_regression.coef_)

from sklearn.metrics import r2_score


predictions = model_multiple_linear_regression.predict(X)


r_squared = r2_score(y, predictions)


print('R-squared:', r_squared)


from sklearn.metrics import mean_absolute_error, mean_squared_error


mae = mean_absolute_error(y, predictions)


mse = mean_squared_error(y, predictions)


print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
Intercept: 2468963.6708604773
Coefficients: [  326.90551452    -9.79526605 -6244.09158944 -2092.64983014
 -6076.8968629   -597.62820523  1348.42057411]
R-squared: 0.002700395453252291
Mean Absolute Error: 371330.1344120291
Mean Squared Error: 183176186226.81674
In [88]:
from sklearn.neural_network import MLPRegressor


X = df[['units', 'avg_size', 'proximity_to_public_transport', 'proximity_to_schools', 'proximity_to_park' , 'year_built', 'crime_rate']]
y = df['recent_sale_price']


model_neural_network = MLPRegressor()
model_neural_network.fit(X, y)


predictions = model_neural_network.predict(X)


print('Predictions:', predictions[:5])


from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


mae = mean_absolute_error(y, predictions)
mse = mean_squared_error(y, predictions)
r2 = r2_score(y, predictions)


print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('R-squared:', r2)
Predictions: [989410.00098055 842980.06261391 902335.02376661 797199.59631309
 905213.46671496]
Mean Absolute Error: 458895.696446196
Mean Squared Error: 316657661688.8813
R-squared: -0.7240372085702156
D:\PYTHON - ANACONDA\Lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py:691: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
  warnings.warn(
In [98]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns


numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])


preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])


ridge_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('ridge', Ridge(alpha=1.0)) 
])


ridge_pipeline.fit(X_train, y_train)


ridge_coefs = ridge_pipeline.named_steps['ridge'].coef_


numeric_feature_names = preprocessor.transformers_[0][2]
categorical_feature_names = ridge_pipeline.named_steps['preprocessor'] \
                                    .named_transformers_['cat'] \
                                    .named_steps['onehot'] \
                                    .get_feature_names_out(input_features=categorical_features)


feature_names = np.concatenate([numeric_feature_names, categorical_feature_names])


coefficients = pd.DataFrame({'Feature': feature_names, 'Coefficient': ridge_coefs})
print(coefficients)
                              Feature   Coefficient
0                               units   4241.587901
1                            avg_size   5087.017658
2        distance_to_public_transport -19992.318008
3                 distance_to_schools  -8438.280193
4                    distance_to_park   5562.966475
5                          year_built  -6420.145866
6                          crime_rate    431.052182
7       proximity_to_public_transport -30555.963268
8                proximity_to_schools -14969.563216
9                   proximity_to_park  -6007.061202
10          neighborhood_Adams Morgan -24176.982979
11             neighborhood_Anacostia -18099.125855
12             neighborhood_Brookland   8338.023897
13          neighborhood_Capitol Hill  33032.630395
14      neighborhood_Columbia Heights  61656.048947
15      neighborhood_Congress Heights  26871.035106
16              neighborhood_Deanwood  -9938.949770
17         neighborhood_Dupont Circle  35007.568600
18          neighborhood_Foggy Bottom  10845.000565
19            neighborhood_Georgetown -16154.167371
20          neighborhood_Kingman Park -88380.606271
21          neighborhood_Logan Circle  29051.133741
22        neighborhood_Mount Pleasant  53640.159890
23              neighborhood_Petworth -36925.707000
24                  neighborhood_Shaw -28395.486934
25  neighborhood_Southwest Waterfront  20990.691595
26              neighborhood_Trinidad -23594.255931
27     neighborhood_U Street Corridor -90271.855266
28              neighborhood_Van Ness  28253.108643
29          neighborhood_Woodley Park  28251.735996
30          property_type_Condominium   7390.384523
31   property_type_Single Family Home  15741.964156
32            property_type_Townhouse -23132.348679
In [92]:
print("Ridge Regression Coefficients:")
print(ridge_pipeline.named_steps['ridge'].coef_)


print("\nRidge Regression Intercept:")
print(ridge_pipeline.named_steps['ridge'].intercept_)
Ridge Regression Coefficients:
[  4241.58790097   5087.01765842 -19992.31800829  -8438.28019253
   5562.9664754   -6420.14586566    431.05218157 -30555.96326826
 -14969.56321551  -6007.06120248 -24176.98297856 -18099.12585456
   8338.02389688  33032.63039521  61656.04894667  26871.03510551
  -9938.94977046  35007.56859982  10845.00056538 -16154.16737124
 -88380.6062706   29051.13374132  53640.15989048 -36925.70700003
 -28395.48693403  20990.69159521 -23594.25593098 -90271.85526579
  28253.10864335  28251.73599636   7390.38452337  15741.96415598
 -23132.34867934]

Ridge Regression Intercept:
1254497.3264722212
In [93]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error


y_train_pred = ridge_pipeline.predict(X_train)
y_test_pred = ridge_pipeline.predict(X_test)


train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)


train_mse = mean_squared_error(y_train, y_train_pred)
test_mse = mean_squared_error(y_test, y_test_pred)


train_mae = mean_absolute_error(y_train, y_train_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)


print("Training R-squared:", train_r2)
print("Testing R-squared:", test_r2)
print("\nTraining Mean Squared Error (MSE):", train_mse)
print("Testing Mean Squared Error (MSE):", test_mse)
print("\nTraining Mean Absolute Error (MAE):", train_mae)
print("Testing Mean Absolute Error (MAE):", test_mae)
Training R-squared: 0.01480160617945514
Testing R-squared: -0.019085432492405463

Training Mean Squared Error (MSE): 160531316961.72372
Testing Mean Squared Error (MSE): 159733095775.36615

Training Mean Absolute Error (MAE): 330501.5824404619
Testing Mean Absolute Error (MAE): 332631.973222693
In [94]:
from sklearn.linear_model import Lasso


lasso_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('lasso', Lasso(alpha=1.0))  
])


lasso_pipeline.fit(X_train, y_train)
D:\PYTHON - ANACONDA\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:628: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 4.439e+12, tolerance: 2.607e+10
  model = cd_fast.enet_coordinate_descent(
Out[94]:
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  Index(['units', 'avg_size', 'distance_to_public_transport',
       'distance_to_schools', 'distance_to_park', 'year_built', 'crime_rate',
       'proximity_to_public_transport', 'proximity_to_schools',
       'proximity_to_park'],
      dtype='object')),
                                                 ('cat',
                                                  Pipeline(steps=[('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  Index(['neighborhood', 'property_type'], dtype='object'))])),
                ('lasso', Lasso())])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  Index(['units', 'avg_size', 'distance_to_public_transport',
       'distance_to_schools', 'distance_to_park', 'year_built', 'crime_rate',
       'proximity_to_public_transport', 'proximity_to_schools',
       'proximity_to_park'],
      dtype='object')),
                                                 ('cat',
                                                  Pipeline(steps=[('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  Index(['neighborhood', 'property_type'], dtype='object'))])),
                ('lasso', Lasso())])
ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('scaler', StandardScaler())]),
                                 Index(['units', 'avg_size', 'distance_to_public_transport',
       'distance_to_schools', 'distance_to_park', 'year_built', 'crime_rate',
       'proximity_to_public_transport', 'proximity_to_schools',
       'proximity_to_park'],
      dtype='object')),
                                ('cat',
                                 Pipeline(steps=[('onehot',
                                                  OneHotEncoder(handle_unknown='ignore'))]),
                                 Index(['neighborhood', 'property_type'], dtype='object'))])
Index(['units', 'avg_size', 'distance_to_public_transport',
       'distance_to_schools', 'distance_to_park', 'year_built', 'crime_rate',
       'proximity_to_public_transport', 'proximity_to_schools',
       'proximity_to_park'],
      dtype='object')
StandardScaler()
Index(['neighborhood', 'property_type'], dtype='object')
OneHotEncoder(handle_unknown='ignore')
Lasso()
In [104]:
processed_columns = list(lasso_pipeline.named_steps['preprocessor']
                         .named_transformers_['cat']
                         .named_steps['onehot']
                         .get_feature_names_out(categorical_features))


all_feature_names = list(numeric_features) + processed_columns


lasso_coefficients = lasso_pipeline.named_steps['lasso'].coef_


coefficients_df = pd.DataFrame({'Feature': all_feature_names, 'Coefficient': lasso_coefficients})


print(coefficients_df.sort_values(by='Coefficient', key=abs, ascending=False))


train_preds = lasso_pipeline.predict(X_train)


test_preds = lasso_pipeline.predict(X_test)


from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error


train_r2 = r2_score(y_train, train_preds)
print("Training R-squared:", train_r2)


test_r2 = r2_score(y_test, test_preds)
print("Testing R-squared:", test_r2)


train_mse = mean_squared_error(y_train, train_preds)
print("Training Mean Squared Error (MSE):", train_mse)


test_mse = mean_squared_error(y_test, test_preds)
print("Testing Mean Squared Error (MSE):", test_mse)


train_mae = mean_absolute_error(y_train, train_preds)
print("Training Mean Absolute Error (MAE):", train_mae)


test_mae = mean_absolute_error(y_test, test_preds)
print("Testing Mean Absolute Error (MAE):", test_mae)
                              Feature   Coefficient
27     neighborhood_U Street Corridor -93494.465789
20          neighborhood_Kingman Park -91339.212019
14      neighborhood_Columbia Heights  60537.678310
22        neighborhood_Mount Pleasant  52370.775665
23              neighborhood_Petworth -39239.571033
17         neighborhood_Dupont Circle  33587.745014
13          neighborhood_Capitol Hill  31645.224161
7       proximity_to_public_transport -30692.676341
24                  neighborhood_Shaw -30562.598600
21          neighborhood_Logan Circle  27496.733445
28              neighborhood_Van Ness  26711.399089
29          neighborhood_Woodley Park  26702.848197
10          neighborhood_Adams Morgan -26307.190419
26              neighborhood_Trinidad -25772.325956
15      neighborhood_Congress Heights  25219.638622
31   property_type_Single Family Home  22639.640280
11             neighborhood_Anacostia -20168.851738
2        distance_to_public_transport -20124.552823
25  neighborhood_Southwest Waterfront  19343.145506
19            neighborhood_Georgetown -18280.968585
32            property_type_Townhouse -16313.562083
8                proximity_to_schools -15022.633723
30          property_type_Condominium  14222.921766
16              neighborhood_Deanwood -11941.103919
18          neighborhood_Foggy Bottom   9110.878070
3                 distance_to_schools  -8496.931232
12             neighborhood_Brookland   6542.955689
5                          year_built  -6429.707350
9                   proximity_to_park  -6011.485785
4                    distance_to_park   5582.990966
1                            avg_size   5081.627157
0                               units   4259.733619
6                          crime_rate    417.882821
Training R-squared: 0.014803292845458782
Testing R-squared: -0.019340358961843274
Training Mean Squared Error (MSE): 160531042131.07382
Testing Mean Squared Error (MSE): 159773053361.70786
Training Mean Absolute Error (MAE): 330538.84752715856
Testing Mean Absolute Error (MAE): 332704.4666508229
In [102]:
from sklearn.ensemble import RandomForestRegressor


rfr_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('random_forest', RandomForestRegressor(n_estimators=100, random_state=42))  
])


rfr_pipeline.fit(X_train, y_train)


y_pred_rfr = rfr_pipeline.predict(X_test)


processed_columns = list(rfr_pipeline.named_steps['preprocessor']
                         .named_transformers_['cat']
                         .named_steps['onehot']
                         .get_feature_names_out(categorical_features))


all_feature_names = list(numeric_features) + processed_columns


feature_importances = rfr_pipeline.named_steps['random_forest'].feature_importances_


importance_df = pd.DataFrame({'Feature': all_feature_names, 'Importance': feature_importances})


print(importance_df.sort_values(by='Importance', ascending=False))




mse_rfr = mean_squared_error(y_test, y_pred_rfr)
r2_rfr = r2_score(y_test, y_pred_rfr)

print("Mean Squared Error (MSE) - RFR:", mse_rfr)
print("R-squared - RFR:", r2_rfr)
                              Feature  Importance
1                            avg_size    0.130950
6                          crime_rate    0.125512
5                          year_built    0.107460
0                               units    0.096296
9                   proximity_to_park    0.061457
3                 distance_to_schools    0.061268
8                proximity_to_schools    0.061208
4                    distance_to_park    0.061193
2        distance_to_public_transport    0.061118
7       proximity_to_public_transport    0.060920
31   property_type_Single Family Home    0.012846
30          property_type_Condominium    0.012149
21          neighborhood_Logan Circle    0.010906
32            property_type_Townhouse    0.010688
26              neighborhood_Trinidad    0.008974
15      neighborhood_Congress Heights    0.008879
11             neighborhood_Anacostia    0.008799
20          neighborhood_Kingman Park    0.007731
28              neighborhood_Van Ness    0.007354
22        neighborhood_Mount Pleasant    0.007295
23              neighborhood_Petworth    0.006862
24                  neighborhood_Shaw    0.006665
14      neighborhood_Columbia Heights    0.006662
12             neighborhood_Brookland    0.006249
27     neighborhood_U Street Corridor    0.006221
29          neighborhood_Woodley Park    0.006033
25  neighborhood_Southwest Waterfront    0.005926
10          neighborhood_Adams Morgan    0.005871
18          neighborhood_Foggy Bottom    0.005533
17         neighborhood_Dupont Circle    0.005456
16              neighborhood_Deanwood    0.005394
19            neighborhood_Georgetown    0.005344
13          neighborhood_Capitol Hill    0.004780
Mean Squared Error (MSE) - RFR: 160201542787.94522
R-squared - RFR: -0.022074090065819618
In [103]:
plt.figure(figsize=(10, 6))
plt.barh(feature_names, feature_importances)
plt.xlabel('Feature Importance')
plt.ylabel('Feature')
plt.title('Feature Importances')
plt.show()
In [ ]: